library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at /Users/itsjoeoui/Developer/McGill/MATH208
# Load the HTRU2 data. The file is read without a header row, so readr assigns
# the placeholder column names X1..X9.
HTRU2 <- here("HTRU2/HTRU_2.csv") %>%
  read_csv(col_names = FALSE)
## Rows: 17898 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): X1, X2, X3, X4, X5, X6, X7, X8, X9
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Replace the placeholder column names with descriptive ones: four statistics
# for IP, four for DMSNR, then the class label.
names(HTRU2) <- c(
  "Mean_IP", "SD_IP", "EK_IP", "SKW_IP",
  "Mean_DMSNR", "SD_DMSNR", "EK_DMSNR", "SKW_DMSNR",
  "Class"
)
# Preview the first rows.
HTRU2 %>% head()
## # A tibble: 6 × 9
## Mean_IP SD_IP EK_IP SKW_IP Mean_DMSNR SD_DMSNR EK_DMSNR SKW_DMSNR Class
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 141. 55.7 -0.235 -0.700 3.20 19.1 7.98 74.2 0
## 2 103. 58.9 0.465 -0.515 1.68 14.9 10.6 127. 0
## 3 103. 39.3 0.323 1.05 3.12 21.7 7.74 63.2 0
## 4 137. 57.2 -0.0684 -0.636 3.64 21.0 6.90 53.6 0
## 5 88.7 40.7 0.601 1.12 1.18 11.5 14.3 253. 0
## 6 93.6 46.7 0.532 0.417 1.64 14.5 10.6 131. 0
# Recode Class as text: 0 becomes "Negative", any other value "Positive".
HTRU2 <- HTRU2 %>%
  mutate(Class = if_else(Class == 0, "Negative", "Positive"))
# Location and spread summaries of Mean_IP over the whole data set.
# The median column is labelled "Median" (it was previously mislabelled
# "Medium").
HTRU2 %>%
  summarise(
    Average = mean(Mean_IP),
    Median = median(Mean_IP),
    `25%ile` = quantile(Mean_IP, 0.25),
    `75%ile` = quantile(Mean_IP, 0.75),
    StD = sd(Mean_IP),
    IQR = IQR(Mean_IP)
  )
## # A tibble: 1 × 6
## Average Median `25%ile` `75%ile` StD IQR
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 111. 115. 101. 127. 25.7 26.2
# The same Mean_IP summaries, computed separately for each class.
# The median column is labelled "Median" (it was previously mislabelled
# "Medium").
HTRU2 %>%
  group_by(Class) %>%
  summarise(
    Average = mean(Mean_IP),
    Median = median(Mean_IP),
    `25%ile` = quantile(Mean_IP, 0.25),
    `75%ile` = quantile(Mean_IP, 0.75),
    StD = sd(Mean_IP),
    IQR = IQR(Mean_IP)
  )
## # A tibble: 2 × 7
## Class Average Median `25%ile` `75%ile` StD IQR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 117. 117. 105. 128. 17.5 23.0
## 2 Positive 56.7 54.3 31.8 79.3 30.0 47.5
# Mean and median of Mean_IP and Mean_DMSNR within each class. Result columns
# are named <variable>_Avg and <variable>_Med.
HTRU2 %>%
  select(Class, Mean_IP, Mean_DMSNR) %>%
  group_by(Class) %>%
  summarise_all(list(Avg = mean, Med = median))
## # A tibble: 2 × 5
## Class Mean_IP_Avg Mean_DMSNR_Avg Mean_IP_Med Mean_DMSNR_Med
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 117. 8.86 117. 2.64
## 2 Positive 56.7 49.8 54.3 33.5
# Per-class mean/median summaries reshaped to long form: one row per
# (Class, Measure) pair, sorted by measure name in descending order.
HTRU2 %>%
  group_by(Class) %>%
  summarise(across(c(Mean_IP, Mean_DMSNR), list(Avg = mean, Med = median))) %>%
  pivot_longer(
    cols = c(Mean_IP_Avg, Mean_DMSNR_Avg, Mean_IP_Med, Mean_DMSNR_Med),
    names_to = "Measure"
  ) %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
## Class Measure value
## <chr> <chr> <dbl>
## 1 Negative Mean_IP_Med 117.
## 2 Positive Mean_IP_Med 54.3
## 3 Negative Mean_IP_Avg 117.
## 4 Positive Mean_IP_Avg 56.7
## 5 Negative Mean_DMSNR_Med 2.64
## 6 Positive Mean_DMSNR_Med 33.5
## 7 Negative Mean_DMSNR_Avg 8.86
## 8 Positive Mean_DMSNR_Avg 49.8
# Same long-form table, but the summary columns are picked with
# starts_with("Mean") instead of being listed one by one.
HTRU2 %>%
  group_by(Class) %>%
  summarise(across(c(Mean_IP, Mean_DMSNR), list(Avg = mean, Med = median))) %>%
  pivot_longer(cols = starts_with("Mean"), names_to = "Measure") %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
## Class Measure value
## <chr> <chr> <dbl>
## 1 Negative Mean_IP_Med 117.
## 2 Positive Mean_IP_Med 54.3
## 3 Negative Mean_IP_Avg 117.
## 4 Positive Mean_IP_Avg 56.7
## 5 Negative Mean_DMSNR_Med 2.64
## 6 Positive Mean_DMSNR_Med 33.5
## 7 Negative Mean_DMSNR_Avg 8.86
## 8 Positive Mean_DMSNR_Avg 49.8
# Mean, median and quartiles of Mean_IP and Mean_DMSNR per class, reshaped so
# that each measure is a row and each class is a column.
HTRU2 %>%
  group_by(Class) %>%
  summarise(across(
    c(Mean_IP, Mean_DMSNR),
    list(
      Avg = mean,
      Med = ~ median(.x),
      Q25 = ~ quantile(.x, probs = 0.25),
      Q75 = ~ quantile(.x, 0.75)
    )
  )) %>%
  # Long form first (Class, Measure, value) ...
  pivot_longer(cols = starts_with("Mean"), names_to = "Measure") %>%
  # ... then spread the classes back out into columns.
  pivot_wider(id_cols = Measure, names_from = Class) %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
## Measure Negative Positive
## <chr> <dbl> <dbl>
## 1 Mean_IP_Q75 128. 79.3
## 2 Mean_IP_Q25 105. 31.8
## 3 Mean_IP_Med 117. 54.3
## 4 Mean_IP_Avg 117. 56.7
## 5 Mean_DMSNR_Q75 4.23 78.3
## 6 Mean_DMSNR_Q25 1.86 12.8
## 7 Mean_DMSNR_Med 2.64 33.5
## 8 Mean_DMSNR_Avg 8.86 49.8
# Correlation between Mean_IP and Mean_DMSNR, computed within each class.
HTRU2 %>%
  group_by(Class) %>%
  summarise(Cor_MeanIP_Mean_DMSNR = cor(x = Mean_IP, y = Mean_DMSNR))
## # A tibble: 2 × 2
## Class Cor_MeanIP_Mean_DMSNR
## <chr> <dbl>
## 1 Negative 0.117
## 2 Positive -0.542
# Scatter plot of Mean_DMSNR against Mean_IP, one panel per class, with a
# linear-model fit drawn in black. The legend is removed because the facet
# strips already identify the class.
# The labels spell the variable "DMSNR", matching the column name (they
# previously read "DMNSR").
ggplot(HTRU2, aes(x = Mean_IP, y = Mean_DMSNR, col = Class)) +
  geom_point() +
  facet_wrap(~Class) +
  labs(x = "Mean IP", y = "Mean DMSNR", title = "Mean IP vs. Mean DMSNR") +
  theme(legend.position = "none") +
  geom_smooth(method = "lm", col = "black")
## `geom_smooth()` using formula 'y ~ x'

# Add the negated Mean_DMSNR as a new column and recompute the per-class
# correlation with Mean_IP using it.
HTRU2 <- HTRU2 %>%
  mutate(Neg_MDMSNR = -Mean_DMSNR)
HTRU2 %>%
  group_by(Class) %>%
  summarise(Cor2 = cor(Mean_IP, Neg_MDMSNR))
## # A tibble: 2 × 2
## Class Cor2
## <chr> <dbl>
## 1 Negative -0.117
## 2 Positive 0.542
# Same faceted scatter plot as for Mean_DMSNR, but with the negated variable
# on the y axis. The y label and title say "Negative Mean DMSNR" so the plot
# is not mistaken for the un-negated one.
ggplot(HTRU2, aes(x = Mean_IP, y = Neg_MDMSNR, col = Class)) +
  geom_point() +
  facet_wrap(~Class) +
  labs(
    x = "Mean IP",
    y = "Negative Mean DMSNR",
    title = "Mean IP vs. Negative Mean DMSNR"
  ) +
  theme(legend.position = "none") +
  geom_smooth(method = "lm", col = "black")
## `geom_smooth()` using formula 'y ~ x'

# Load the Boston crime incident data; column types are guessed by readr.
crime <- here("BostonCrime/crime.csv") %>%
  read_csv()
## Rows: 327820 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): INCIDENT_NUMBER, OFFENSE_CODE_GROUP, OFFENSE_DESCRIPTION, DISTRICT...
## dbl (7): OFFENSE_CODE, REPORTING_AREA, YEAR, MONTH, HOUR, Lat, Long
## lgl (1): SHOOTING
## dttm (1): OCCURRED_ON_DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the crime table.
crime %>% head()
## # A tibble: 6 × 17
## INCIDENT…¹ OFFEN…² OFFEN…³ OFFEN…⁴ DISTR…⁵ REPOR…⁶ SHOOT…⁷ OCCURRED_ON_DATE
## <chr> <dbl> <chr> <chr> <chr> <dbl> <lgl> <dttm>
## 1 I182080058 2403 Disord… DISTUR… E18 495 FALSE 2018-10-03 20:13:00
## 2 I182080053 3201 Proper… PROPER… D14 795 FALSE 2018-08-30 20:00:00
## 3 I182080052 2647 Other THREAT… B2 329 FALSE 2018-10-03 19:20:00
## 4 I182080051 413 Aggrav… ASSAUL… A1 92 FALSE 2018-10-03 20:00:00
## 5 I182080050 3122 Aircra… AIRCRA… A7 36 FALSE 2018-10-03 20:49:00
## 6 I182080049 1402 Vandal… VANDAL… C11 351 FALSE 2018-10-02 20:40:00
## # … with 9 more variables: YEAR <dbl>, MONTH <dbl>, DAY_OF_WEEK <chr>,
## # HOUR <dbl>, UCR_PART <chr>, STREET <chr>, Lat <dbl>, Long <dbl>,
## # Location <chr>, and abbreviated variable names ¹INCIDENT_NUMBER,
## # ²OFFENSE_CODE, ³OFFENSE_CODE_GROUP, ⁴OFFENSE_DESCRIPTION, ⁵DISTRICT,
## # ⁶REPORTING_AREA, ⁷SHOOTING
# List the column names.
crime %>% names()
## [1] "INCIDENT_NUMBER" "OFFENSE_CODE" "OFFENSE_CODE_GROUP"
## [4] "OFFENSE_DESCRIPTION" "DISTRICT" "REPORTING_AREA"
## [7] "SHOOTING" "OCCURRED_ON_DATE" "YEAR"
## [10] "MONTH" "DAY_OF_WEEK" "HOUR"
## [13] "UCR_PART" "STREET" "Lat"
## [16] "Long" "Location"
# Show the type and first few values of every column, plus the readr
# column specification stored as an attribute.
crime %>% str()
## spec_tbl_df [327,820 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ INCIDENT_NUMBER : chr [1:327820] "I182080058" "I182080053" "I182080052" "I182080051" ...
## $ OFFENSE_CODE : num [1:327820] 2403 3201 2647 413 3122 ...
## $ OFFENSE_CODE_GROUP : chr [1:327820] "Disorderly Conduct" "Property Lost" "Other" "Aggravated Assault" ...
## $ OFFENSE_DESCRIPTION: chr [1:327820] "DISTURBING THE PEACE" "PROPERTY - LOST" "THREATS TO DO BODILY HARM" "ASSAULT - AGGRAVATED - BATTERY" ...
## $ DISTRICT : chr [1:327820] "E18" "D14" "B2" "A1" ...
## $ REPORTING_AREA : num [1:327820] 495 795 329 92 36 351 NA 603 543 621 ...
## $ SHOOTING : logi [1:327820] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ OCCURRED_ON_DATE : POSIXct[1:327820], format: "2018-10-03 20:13:00" "2018-08-30 20:00:00" ...
## $ YEAR : num [1:327820] 2018 2018 2018 2018 2018 ...
## $ MONTH : num [1:327820] 10 8 10 10 10 10 10 10 10 10 ...
## $ DAY_OF_WEEK : chr [1:327820] "Wednesday" "Thursday" "Wednesday" "Wednesday" ...
## $ HOUR : num [1:327820] 20 20 19 20 20 20 20 19 19 20 ...
## $ UCR_PART : chr [1:327820] "Part Two" "Part Three" "Part Two" "Part One" ...
## $ STREET : chr [1:327820] "ARLINGTON ST" "ALLSTON ST" "DEVON ST" "CAMBRIDGE ST" ...
## $ Lat : num [1:327820] 42.3 42.4 42.3 42.4 42.4 ...
## $ Long : num [1:327820] -71.1 -71.1 -71.1 -71.1 -71 ...
## $ Location : chr [1:327820] "(42.26260773, -71.12118637)" "(42.35211146, -71.13531147)" "(42.30812619, -71.07692974)" "(42.35945371, -71.05964817)" ...
## - attr(*, "spec")=
## .. cols(
## .. INCIDENT_NUMBER = col_character(),
## .. OFFENSE_CODE = col_double(),
## .. OFFENSE_CODE_GROUP = col_character(),
## .. OFFENSE_DESCRIPTION = col_character(),
## .. DISTRICT = col_character(),
## .. REPORTING_AREA = col_double(),
## .. SHOOTING = col_logical(),
## .. OCCURRED_ON_DATE = col_datetime(format = ""),
## .. YEAR = col_double(),
## .. MONTH = col_double(),
## .. DAY_OF_WEEK = col_character(),
## .. HOUR = col_double(),
## .. UCR_PART = col_character(),
## .. STREET = col_character(),
## .. Lat = col_double(),
## .. Long = col_double(),
## .. Location = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Number and proportion of incidents for each day of the week.
crime %>%
  count(DAY_OF_WEEK, name = "count") %>%
  mutate(prop = count / sum(count))
## # A tibble: 7 × 3
## DAY_OF_WEEK count prop
## <chr> <int> <dbl>
## 1 Friday 49758 0.152
## 2 Monday 46970 0.143
## 3 Saturday 45969 0.140
## 4 Sunday 41374 0.126
## 5 Thursday 47872 0.146
## 6 Tuesday 47726 0.146
## 7 Wednesday 48151 0.147
# Same day-of-week table, ordered from the busiest day to the quietest.
crime %>%
  count(DAY_OF_WEEK, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(desc(count))
## # A tibble: 7 × 3
## DAY_OF_WEEK count prop
## <chr> <int> <dbl>
## 1 Friday 49758 0.152
## 2 Wednesday 48151 0.147
## 3 Thursday 47872 0.146
## 4 Tuesday 47726 0.146
## 5 Monday 46970 0.143
## 6 Saturday 45969 0.140
## 7 Sunday 41374 0.126
# Number and proportion of incidents per numeric month, in calendar order.
crime %>%
  count(MONTH, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(MONTH)
## # A tibble: 12 × 3
## MONTH count prop
## <dbl> <int> <dbl>
## 1 1 23625 0.0721
## 2 2 21661 0.0661
## 3 3 24156 0.0737
## 4 4 24108 0.0735
## 5 5 26242 0.0801
## 6 6 30622 0.0934
## 7 7 34640 0.106
## 8 8 35137 0.107
## 9 9 34023 0.104
## 10 10 26437 0.0806
## 11 11 23685 0.0723
## 12 12 23484 0.0716
# month.abb is R's built-in vector of three-letter month abbreviations.
month.abb
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
# Index month.abb by the numeric MONTH to get a text month column.
crime <- crime %>%
  mutate(Month = month.abb[MONTH])
# Spot-check the mapping on the first five rows.
crime %>%
  select(MONTH, Month) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
## MONTH Month
## <dbl> <chr>
## 1 10 Oct
## 2 8 Aug
## 3 10 Oct
## 4 10 Oct
## 5 10 Oct
# Counts per text month. Month is still a character column here, so
# arrange() sorts it alphabetically rather than in calendar order.
crime %>%
  count(Month, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(Month)
## # A tibble: 12 × 3
## Month count prop
## <chr> <int> <dbl>
## 1 Apr 24108 0.0735
## 2 Aug 35137 0.107
## 3 Dec 23484 0.0716
## 4 Feb 21661 0.0661
## 5 Jan 23625 0.0721
## 6 Jul 34640 0.106
## 7 Jun 30622 0.0934
## 8 Mar 24156 0.0737
## 9 May 26242 0.0801
## 10 Nov 23685 0.0723
## 11 Oct 26437 0.0806
## 12 Sep 34023 0.104
# A plain character vector of course codes.
courses <- paste(
  "MATH",
  c(203, 204, 208, 324, 423, 447, 523, 525, 533, 545)
)
class(courses)
## [1] "character"
# As a factor the values are stored as numeric codes plus a "levels"
# attribute, which is why mode() reports "numeric".
courses_fct <- factor(courses)
mode(courses_fct)
## [1] "numeric"
# Inspect the attributes that make the object a factor.
attributes(courses_fct)
## $levels
## [1] "MATH 203" "MATH 204" "MATH 208" "MATH 324" "MATH 423" "MATH 447"
## [7] "MATH 523" "MATH 525" "MATH 533" "MATH 545"
##
## $class
## [1] "factor"
# Store the day of the week as a factor whose levels run Monday..Sunday, so
# that grouped tables follow the week rather than the alphabet.
crime <- crime %>%
  mutate(
    Day_of_week = fct_relevel(
      DAY_OF_WEEK,
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  )
crime %>%
  count(Day_of_week, name = "count") %>%
  mutate(prop = count / sum(count))
## # A tibble: 7 × 3
## Day_of_week count prop
## <fct> <int> <dbl>
## 1 Monday 46970 0.143
## 2 Tuesday 47726 0.146
## 3 Wednesday 48151 0.147
## 4 Thursday 47872 0.146
## 5 Friday 49758 0.152
## 6 Saturday 45969 0.140
## 7 Sunday 41374 0.126
# Turn Month into a factor with levels in calendar order (Jan..Dec).
crime <- crime %>%
  mutate(Month = fct_relevel(Month, month.abb))
# Monthly counts and proportions, now sorted in calendar order.
crime_by_month <- crime %>%
  count(Month, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(Month)
crime_by_month
## # A tibble: 12 × 3
## Month count prop
## <fct> <int> <dbl>
## 1 Jan 23625 0.0721
## 2 Feb 21661 0.0661
## 3 Mar 24156 0.0737
## 4 Apr 24108 0.0735
## 5 May 26242 0.0801
## 6 Jun 30622 0.0934
## 7 Jul 34640 0.106
## 8 Aug 35137 0.107
## 9 Sep 34023 0.104
## 10 Oct 26437 0.0806
## 11 Nov 23685 0.0723
## 12 Dec 23484 0.0716
# Pie chart of monthly counts: a single stacked bar drawn in polar
# coordinates, with the bar height mapped to the angle.
ggplot(crime_by_month, aes(x = "", y = count, fill = Month)) +
  geom_col() +
  coord_polar(theta = "y", start = 0)

# Same pie built from proportions instead of raw counts.
ggplot(crime_by_month, aes(x = "", y = prop, fill = Month)) +
  geom_col() +
  coord_polar(theta = "y", start = 0)

# Count-based pie again, using the discrete viridis palette.
ggplot(crime_by_month, aes(x = "", y = count, fill = Month)) +
  geom_col() +
  coord_polar(theta = "y", start = 0) +
  scale_fill_viridis_d()

# Bar chart of incident counts per month, built directly from the raw rows
# (geom_bar() does the counting).
ggplot(crime, aes(x = Month, fill = Month)) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(y = "Total number of crimes")

# Bar chart of the proportion of incidents per month. The height of each bar
# is the computed count divided by the total count, so the y axis is labelled
# as a proportion. after_stat() replaces the older ..count.. notation for
# referring to the computed statistic.
ggplot(crime, aes(x = Month, fill = Month)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Counts and proportions per offense code group, most frequent first.
crime %>%
  count(OFFENSE_CODE_GROUP, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(desc(prop))
## # A tibble: 67 × 3
## OFFENSE_CODE_GROUP count prop
## <chr> <int> <dbl>
## 1 Motor Vehicle Accident Response 38134 0.116
## 2 Larceny 26670 0.0814
## 3 Medical Assistance 24226 0.0739
## 4 Investigate Person 19176 0.0585
## 5 Other 18612 0.0568
## 6 Drug Violation 17037 0.0520
## 7 Simple Assault 16263 0.0496
## 8 Vandalism 15810 0.0482
## 9 Verbal Disputes 13478 0.0411
## 10 Towed 11632 0.0355
## # … with 57 more rows
# Per-offense-group counts and proportions, kept for the plots that follow.
off_code_counts <- crime %>%
  group_by(OFFENSE_CODE_GROUP) %>%
  summarise(count = n()) %>%
  mutate(prop = count / sum(count))
# Bar chart of the proportion of incidents in every offense group. The y label
# uses the same wording ("Proportion of crimes") as the other proportion plots
# in this script.
ggplot(
  off_code_counts,
  aes(x = OFFENSE_CODE_GROUP, fill = OFFENSE_CODE_GROUP)
) +
  geom_bar(stat = "identity", aes(y = prop)) +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Keep the 12 most frequent offense groups as their own levels and lump every
# remaining group into "Other".
crime <- crime %>%
  mutate(code_lmp = fct_lump(OFFENSE_CODE_GROUP, n = 12))
crime %>% head()
## # A tibble: 6 × 20
## INCIDENT…¹ OFFEN…² OFFEN…³ OFFEN…⁴ DISTR…⁵ REPOR…⁶ SHOOT…⁷ OCCURRED_ON_DATE
## <chr> <dbl> <chr> <chr> <chr> <dbl> <lgl> <dttm>
## 1 I182080058 2403 Disord… DISTUR… E18 495 FALSE 2018-10-03 20:13:00
## 2 I182080053 3201 Proper… PROPER… D14 795 FALSE 2018-08-30 20:00:00
## 3 I182080052 2647 Other THREAT… B2 329 FALSE 2018-10-03 19:20:00
## 4 I182080051 413 Aggrav… ASSAUL… A1 92 FALSE 2018-10-03 20:00:00
## 5 I182080050 3122 Aircra… AIRCRA… A7 36 FALSE 2018-10-03 20:49:00
## 6 I182080049 1402 Vandal… VANDAL… C11 351 FALSE 2018-10-02 20:40:00
## # … with 12 more variables: YEAR <dbl>, MONTH <dbl>, DAY_OF_WEEK <chr>,
## # HOUR <dbl>, UCR_PART <chr>, STREET <chr>, Lat <dbl>, Long <dbl>,
## # Location <chr>, Month <fct>, Day_of_week <fct>, code_lmp <fct>, and
## # abbreviated variable names ¹INCIDENT_NUMBER, ²OFFENSE_CODE,
## # ³OFFENSE_CODE_GROUP, ⁴OFFENSE_DESCRIPTION, ⁵DISTRICT, ⁶REPORTING_AREA,
## # ⁷SHOOTING
# Counts (n) and proportions for the lumped offense groups, sorted from the
# least to the most frequent.
off_code_counts_lmp <- crime %>%
  count(code_lmp) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(n)
off_code_counts_lmp
## # A tibble: 12 × 3
## code_lmp n prop
## <fct> <int> <dbl>
## 1 Larceny From Motor Vehicle 11120 0.0339
## 2 Investigate Property 11443 0.0349
## 3 Towed 11632 0.0355
## 4 Verbal Disputes 13478 0.0411
## 5 Vandalism 15810 0.0482
## 6 Simple Assault 16263 0.0496
## 7 Drug Violation 17037 0.0520
## 8 Investigate Person 19176 0.0585
## 9 Medical Assistance 24226 0.0739
## 10 Larceny 26670 0.0814
## 11 Motor Vehicle Accident Response 38134 0.116
## 12 Other 122831 0.375
# Proportion of incidents per lumped offense group, bars in the factor's
# current level order.
ggplot(off_code_counts_lmp, aes(x = code_lmp, y = prop, fill = code_lmp)) +
  geom_col() +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Reorder the levels by decreasing count so the bars run from the most to the
# least frequent group, then redraw.
off_code_counts_lmp <- off_code_counts_lmp %>%
  mutate(code_lmp = fct_reorder(code_lmp, n, .desc = TRUE))
ggplot(off_code_counts_lmp, aes(x = code_lmp, y = prop, fill = code_lmp)) +
  geom_col() +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

library(treemapify)
# Treemap of the lumped offense groups: tile area is proportional to the count.
off_code_counts_lmp %>%
  ggplot(aes(area = n, fill = code_lmp)) +
  geom_treemap() +
  scale_fill_viridis_d()

# Order the full set of offense groups by decreasing count.
off_code_counts <- off_code_counts %>%
  mutate(
    OFFENSE_CODE_GROUP = fct_reorder(OFFENSE_CODE_GROUP, count, .desc = TRUE)
  )
# Build the treemap but keep it in a variable instead of drawing it, so the
# plot object itself can be inspected and modified.
p <- off_code_counts %>%
  ggplot(aes(area = count, fill = OFFENSE_CODE_GROUP)) +
  geom_treemap()
class(p)
## [1] "gg" "ggplot"
# The components that make up a ggplot object.
attributes(p)
## $names
## [1] "data" "layers" "scales" "mapping" "theme"
## [6] "coordinates" "facet" "plot_env" "labels"
##
## $class
## [1] "gg" "ggplot"
# Draw the stored treemap, legend included.
print(p)

# Draw it again with the legend suppressed.
p +
  theme(legend.position = "none")

library(ggpubr)
# Extract just the legend of the treemap (with smaller text) and draw it as a
# standalone plot.
(p + theme(legend.text = element_text(size = 8))) %>%
  get_legend() %>%
  as_ggplot()

# Order the lumped offense levels from most to least frequent.
crime <- crime %>%
  mutate(code_lmp = fct_infreq(code_lmp))
# Stacked bars: one bar per month, split by offense group.
crime %>%
  ggplot(aes(x = Month, fill = code_lmp)) +
  geom_bar() +
  scale_fill_viridis_d()

# One panel per month with a bar per offense group. The x-axis title, text
# and ticks are removed because the fill legend already names the groups.
crime %>%
  ggplot(aes(x = code_lmp, fill = code_lmp)) +
  geom_bar(position = "dodge") +
  facet_wrap(~Month) +
  scale_fill_viridis_d() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

library(ggmosaic)
# Mosaic plot of offense group by day of week, filled by day of week.
# The x-axis title, text and ticks are hidden.
ggplot(crime) +
  geom_mosaic(
    aes(x = product(code_lmp, Day_of_week), fill = Day_of_week)
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## Please use `unite()` instead.

# Same two variables with their roles swapped, filled by offense group.
ggplot(crime) +
  geom_mosaic(
    aes(x = product(Day_of_week, code_lmp), fill = code_lmp)
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Look at the first 20 incident timestamps.
crime %>%
  select(OCCURRED_ON_DATE) %>%
  slice_head(n = 20)
## # A tibble: 20 × 1
## OCCURRED_ON_DATE
## <dttm>
## 1 2018-10-03 20:13:00
## 2 2018-08-30 20:00:00
## 3 2018-10-03 19:20:00
## 4 2018-10-03 20:00:00
## 5 2018-10-03 20:49:00
## 6 2018-10-02 20:40:00
## 7 2018-10-03 20:16:00
## 8 2018-10-03 19:32:00
## 9 2018-10-03 19:27:00
## 10 2018-10-03 20:00:00
## 11 2018-10-03 19:33:00
## 12 2018-10-01 20:00:00
## 13 2018-10-03 17:18:00
## 14 2018-10-03 08:00:00
## 15 2018-10-03 19:58:00
## 16 2018-10-03 19:30:00
## 17 2018-10-03 18:35:00
## 18 2018-10-03 19:56:00
## 19 2018-10-03 18:41:00
## 20 2018-10-03 18:18:00
# Earliest, median and latest incident timestamp.
crime %>%
  summarise(
    min = min(OCCURRED_ON_DATE),
    med = median(OCCURRED_ON_DATE),
    max = max(OCCURRED_ON_DATE)
  )
## # A tibble: 1 × 3
## min med max
## <dttm> <dttm> <dttm>
## 1 2015-06-15 00:00:00 2017-02-14 15:49:00 2018-10-03 20:49:00
# The timestamp column is a POSIXct vector ...
class(crime$OCCURRED_ON_DATE)
## [1] "POSIXct" "POSIXt"
# ... which is a number underneath, as the first 20 values show.
as.numeric(head(crime$OCCURRED_ON_DATE, 20))
## [1] 1538597580 1535659200 1538594400 1538596800 1538599740 1538512800
## [7] 1538597760 1538595120 1538594820 1538596800 1538595180 1538424000
## [13] 1538587080 1538553600 1538596680 1538595000 1538591700 1538596560
## [19] 1538592060 1538590680
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# A date written as text is just a character string ...
my_date <- "2003-05-29"
class(my_date)
## [1] "character"
# ... until it is parsed; ymd() expects year-month-day order.
my_date <- ymd(my_date)
class(my_date)
## [1] "Date"
# Parse a full timestamp and attach a time zone.
other_date <- ymd_hms("2009-05-02 02:57:00", tz = "America/Montreal")
other_date
## [1] "2009-05-02 02:57:00 EDT"
# Date arithmetic is vectorised over periods of days ...
my_date + days(-2:2)
## [1] "2003-05-27" "2003-05-28" "2003-05-29" "2003-05-30" "2003-05-31"
# ... and of months.
my_date + months(-1:1)
## [1] "2003-04-29" "2003-05-29" "2003-06-29"
# Current timestamp (the echoed value is from when the script was run).
now()
## [1] "2022-10-06 17:47:11 EDT"
# Subtracting two dates gives a time difference in days.
today() - my_date
## Time difference of 7070 days
# Length of the interval from other_date to now, expressed in years.
(other_date %--% now()) / years(1)
## [1] 13.43183
# Number of incidents per calendar day (the time of day is dropped).
by_date_tbl <- crime %>%
  count(date_only = date(OCCURRED_ON_DATE), name = "count")
# The five days with the most incidents.
by_date_tbl %>%
  arrange(desc(count)) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
## date_only count
## <date> <int>
## 1 2016-09-01 379
## 2 2017-09-01 377
## 3 2018-06-15 376
## 4 2017-09-22 369
## 5 2017-08-04 361
# Daily incident counts as a line, with a smoothed trend on top.
by_date_tbl %>%
  ggplot(aes(x = date_only, y = count)) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Incident counts per (year, month). summarise() removes only the last
# grouping level, so the result is still grouped by YEAR, as the message
# below reports.
by_month_tbl <- crime %>%
  group_by(YEAR, Month) %>%
  summarise(count = n())
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# The five busiest year-month combinations.
by_month_tbl %>%
  arrange(desc(count)) %>%
  head(5)
## # A tibble: 5 × 3
## # Groups: YEAR [2]
## YEAR Month count
## <dbl> <fct> <int>
## 1 2017 Aug 9209
## 2 2017 Jul 9077
## 3 2017 Jun 8990
## 4 2017 Sep 8950
## 5 2016 Aug 8940
# Combine month and year into one factor ("Jan.2016", ...), keeping only the
# combinations that occur in the data.
by_month_tbl <- by_month_tbl %>%
  ungroup() %>%
  mutate(Month_Year = interaction(Month, YEAR, drop = TRUE))
# The January level of each year, and its integer position among the levels
# (used as the x position of the reference lines in the plot).
Jan_levels <- by_month_tbl %>%
  filter(Month == "Jan") %>%
  pull(Month_Year) %>%
  unique()
tibble(Jan_levels, as.numeric(Jan_levels))
## # A tibble: 3 × 2
## Jan_levels `as.numeric(Jan_levels)`
## <fct> <dbl>
## 1 Jan.2016 8
## 2 Jan.2017 20
## 3 Jan.2018 32
# Monthly counts against the Month_Year factor. group = 1 joins all points
# with a single line even though x is discrete. Dashed red lines mark each
# January.
by_month_tbl %>%
  ggplot(aes(x = Month_Year, y = count, group = 1)) +
  geom_point() +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Date", y = "Count", title = "Number of crimes by month") +
  geom_vline(
    xintercept = as.numeric(Jan_levels),
    col = "red",
    linetype = "dashed"
  )

# Round every timestamp down to the start of its month.
crime <- crime %>%
  mutate(First_of_month = floor_date(OCCURRED_ON_DATE, unit = "month"))
# Check the result on 10 randomly chosen rows.
crime %>%
  slice(sample(seq_len(n()), size = 10)) %>%
  select(OCCURRED_ON_DATE, First_of_month)
## # A tibble: 10 × 2
## OCCURRED_ON_DATE First_of_month
## <dttm> <dttm>
## 1 2017-03-22 16:37:00 2017-03-01 00:00:00
## 2 2015-12-28 10:18:00 2015-12-01 00:00:00
## 3 2017-02-20 19:11:00 2017-02-01 00:00:00
## 4 2017-01-18 15:36:00 2017-01-01 00:00:00
## 5 2016-07-27 20:30:00 2016-07-01 00:00:00
## 6 2016-06-03 22:42:00 2016-06-01 00:00:00
## 7 2016-04-29 11:23:00 2016-04-01 00:00:00
## 8 2017-08-03 10:04:00 2017-08-01 00:00:00
## 9 2017-08-04 19:33:00 2017-08-01 00:00:00
## 10 2017-09-24 18:00:00 2017-09-01 00:00:00
# Incident counts per month, keyed by a real date-time (first of the month)
# so the x axis is a proper time scale.
by_month_tbl2 <- crime %>%
  group_by(First_of_month) %>%
  summarise(count = n())
# Dashed red lines mark 1 January of each year. They are built in the same
# time zone as the plotted column; without an explicit tz, as.POSIXct() would
# use the session's local zone and the lines would sit a few hours off.
ggplot(by_month_tbl2, aes(x = First_of_month, y = count)) +
  geom_point() +
  geom_line() +
  labs(x = "Date", y = "Count", title = "Number of crimes by month") +
  geom_vline(
    xintercept = as.POSIXct(
      c("2016-01-01", "2017-01-01", "2018-01-01"),
      tz = tz(by_month_tbl2$First_of_month)
    ),
    col = "red",
    linetype = "dashed"
  )

# Special numeric values and coercion rules, shown by example.

# Dividing a non-zero number by zero gives Inf rather than an error.
1/0
## [1] Inf
# Inf can be used in further arithmetic.
exp(-Inf)
## [1] 0
# An undefined result is NaN ("not a number").
0/0
## [1] NaN
# The real square root of a negative number is NaN, with a warning ...
sqrt(-1)
## Warning in sqrt(-1): NaNs produced
## [1] NaN
# ... but it is defined once the input is complex.
sqrt(as.complex(-1))
## [1] 0+1i
# Element-wise addition: NA and NaN propagate, and -Inf + Inf is NaN.
c(1,2,3,-Inf) + c(NA, Inf, NaN, Inf)
## [1] NA Inf NaN NaN
# Text that cannot be parsed as a number becomes NA, with a warning.
as.numeric("My Missing Value")
## Warning: NAs introduced by coercion
## [1] NA
# The same text wrapped in a factor converts to its level code instead,
# silently.
as.numeric(factor("My Missing Value"))
## [1] 1
# Mixing numbers and strings in c() coerces everything to character.
c(1, "3")
## [1] "1" "3"
# Strings that look like numbers convert back cleanly.
as.numeric(c(1, "3"))
## [1] 1 3
# Number of missing values in every column, one row per column.
crime %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable")
## # A tibble: 21 × 2
## Variable value
## <chr> <int>
## 1 INCIDENT_NUMBER 0
## 2 OFFENSE_CODE 0
## 3 OFFENSE_CODE_GROUP 0
## 4 OFFENSE_DESCRIPTION 0
## 5 DISTRICT 1774
## 6 REPORTING_AREA 20920
## 7 SHOOTING 0
## 8 OCCURRED_ON_DATE 0
## 9 YEAR 0
## 10 MONTH 0
## # … with 11 more rows
# Same missing-value counts, keeping only columns that have at least one NA.
crime %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  filter(value > 0)
## # A tibble: 6 × 2
## Variable value
## <chr> <int>
## 1 DISTRICT 1774
## 2 REPORTING_AREA 20920
## 3 UCR_PART 93
## 4 STREET 10977
## 5 Lat 20632
## 6 Long 20632
# Drop every row that has an NA in any column.
crime_no_na <- drop_na(crime)
# Row count before ...
summarise(crime, n())
## # A tibble: 1 × 1
## `n()`
## <int>
## 1 327820
# ... and after dropping incomplete rows.
summarise(crime_no_na, n())
## # A tibble: 1 × 1
## `n()`
## <int>
## 1 304167
# Dropping only the rows where UCR_PART is missing removes far fewer.
crime %>%
  drop_na(UCR_PART) %>%
  summarise(n())
## # A tibble: 1 × 1
## `n()`
## <int>
## 1 327727
# Distinct lumped offense groups, sorted by factor level order.
sort(unique(crime$code_lmp))
## [1] Other Motor Vehicle Accident Response
## [3] Larceny Medical Assistance
## [5] Investigate Person Drug Violation
## [7] Simple Assault Vandalism
## [9] Verbal Disputes Towed
## [11] Investigate Property Larceny From Motor Vehicle
## 12 Levels: Other Motor Vehicle Accident Response Larceny ... Larceny From Motor Vehicle
# Merge related offense groups: the two "Investigate ..." levels become
# "Investigate" and the two motor-vehicle levels become "Motor Vehicle".
# Levels not listed are left unchanged.
crime <- crime %>%
  mutate(
    code_lmp_alt = recode(
      code_lmp,
      `Investigate Person` = "Investigate",
      `Investigate Property` = "Investigate",
      `Motor Vehicle Accident Response` = "Motor Vehicle",
      `Larceny From Motor Vehicle` = "Motor Vehicle"
    )
  )
# Counts for the merged grouping, largest first.
crime %>%
  count(code_lmp_alt, name = "count") %>%
  arrange(desc(count))
## # A tibble: 10 × 2
## code_lmp_alt count
## <fct> <int>
## 1 Other 122831
## 2 Motor Vehicle 49254
## 3 Investigate 30619
## 4 Larceny 26670
## 5 Medical Assistance 24226
## 6 Drug Violation 17037
## 7 Simple Assault 16263
## 8 Vandalism 15810
## 9 Verbal Disputes 13478
## 10 Towed 11632
# Most frequent Location strings; placeholder coordinates such as (0, 0) and
# (-1, -1) show up among the top entries.
crime %>%
  count(Location, name = "count") %>%
  arrange(desc(count))
## # A tibble: 18,255 × 2
## Location count
## <chr> <int>
## 1 (0.00000000, 0.00000000) 20632
## 2 (42.34862382, -71.08277637) 1276
## 3 (42.36183857, -71.05976489) 1248
## 4 (42.28482577, -71.09137369) 1137
## 5 (42.32866284, -71.08563401) 1075
## 6 (42.25621592, -71.12401947) 916
## 7 (42.29755533, -71.05970910) 794
## 8 (42.34128751, -71.05467933) 786
## 9 (-1.00000000, -1.00000000) 775
## 10 (42.33152148, -71.07085307) 760
## # … with 18,245 more rows
# Treat the two placeholder coordinate strings as missing: recoding a level
# to NULL in fct_recode() removes it, so those rows become NA.
crime <- crime %>%
  mutate(
    Location_alt = fct_recode(
      Location,
      NULL = "(-1.00000000, -1.00000000)",
      NULL = "(0.00000000, 0.00000000)"
    )
  )
# Counts per cleaned location; the placeholders are now pooled under NA.
crime %>%
  count(Location_alt, name = "count") %>%
  arrange(desc(count))
## # A tibble: 18,254 × 2
## Location_alt count
## <fct> <int>
## 1 <NA> 21407
## 2 (42.34862382, -71.08277637) 1276
## 3 (42.36183857, -71.05976489) 1248
## 4 (42.28482577, -71.09137369) 1137
## 5 (42.32866284, -71.08563401) 1075
## 6 (42.25621592, -71.12401947) 916
## 7 (42.29755533, -71.05970910) 794
## 8 (42.34128751, -71.05467933) 786
## 9 (42.33152148, -71.07085307) 760
## 10 (42.35231190, -71.06370510) 707
## # … with 18,244 more rows